Analysis date: 2020-01-10

Setup

Load libraries

library(plyr)
library(gtools)
library(openxlsx)
library(pheatmap)
library(reshape2)
library(progress)
library(Matrix)
library(Hmisc)
library(lemon)
library(ggpubr)
library(effsize)
library(ggbeeswarm)
library(ggfortify)
library(ggpmisc)
library(ggrepel)
library(readxl)
library(DESeq2)
library(TOSTER)
library(tidyverse)
library(vsn)
library(fdrtool)
library(limma)
library(apeglm)
library(IHW)
library(Rtsne)
library(biomartr)
library(biomaRt)
library(MultiAssayExperiment)
library(PMA)
library(gplots)
library(RColorBrewer)
library(grid)
library(ConsensusClusterPlus)
library(survival)
library(survminer)
library(cowplot)

Set paths

mae_path <- "/Volumes/sd17b003/Sophie/Analysis/Screen_analysis/Robjects"
mae_path_Eva <- "/Volumes/sd17b003/Sophie/Eva/Master_thesis/data"
consensus_path <- "/Volumes/sd17b003/Sophie/Analysis/CLL_Proteomics"

Load data

#load(file.path(mae_path, "multiomics_MAE_full.RData"))
load(file.path(mae_path_Eva, "multiomics_MAE.RData"))
source("/Volumes/sd17b003/Sophie/Analysis/Screen_analysis/Figure_layouts.R")

Format data

Proteomics

prot_few_nas <- multiomics_MAE[["proteomics"]] %>% is.na() %>% rowSums()
prot_few_nas <- prot_few_nas[ prot_few_nas == 0 ] %>% names()

Var50 <- assay(multiomics_MAE[prot_few_nas , ,"proteomics"]) %>% rowVars(na.rm = TRUE) %>% sort(decreasing = TRUE) %>% .[50]
Var500 <- assay(multiomics_MAE[prot_few_nas , ,"proteomics"]) %>% rowVars(na.rm = TRUE) %>% sort(decreasing = TRUE) %>% .[500]
Var1000 <- assay(multiomics_MAE[prot_few_nas , ,"proteomics"]) %>% rowVars(na.rm = TRUE) %>% sort(decreasing = TRUE) %>% .[1000]

Biomart

Get annotation from biomart
#ensembl=useMart("ensembl")
#ensembl = useDataset("hsapiens_gene_ensembl",mart=ensembl)
#genemap <- getBM(attributes = c("ensembl_gene_id", "hgnc_symbol", "start_position", "end_position" , "chromosome_name", "description"), 
#    filters = "hgnc_symbol", values = (prot_few_nas %>% unique), mart = ensembl)
#genemap <- genemap %>% as_tibble() %>% mutate(mean_position=(start_position + end_position)/2)
load("/Volumes/sd17b003/Sophie/Analysis/CLL_Proteomics/R_objects/ensembl_proteins_location.RData")
Add gene location to MAE
metadata(multiomics_MAE)$gene_symbol_mapping <- left_join(metadata(multiomics_MAE)$gene_symbol_mapping, genemap)
Create proteomics_tbl_meta_biomart tbl
proteomics_tbl_meta_biomart_chrab <- wideFormat(multiomics_MAE[,,"chrom_abber"]) %>% as_tibble()
#proteomics_tbl_meta_biomart_chrab[, -1] <- proteomics_tbl_meta_biomart_chrab[, -1] > 20
proteomics_tbl_meta_biomart_chrab <- mutate_if(proteomics_tbl_meta_biomart_chrab, is.numeric, as.logical)
proteomics_tbl_meta_biomart_chrab <- proteomics_tbl_meta_biomart_chrab %>%  dplyr::select(primary, 
                colnames(proteomics_tbl_meta_biomart_chrab[, -1])[
                  apply(proteomics_tbl_meta_biomart_chrab[, -1] ,2, function(cl){ all(table(cl)>2) } )
                  ] )
proteomics_tbl_meta_biomart_chrab <- proteomics_tbl_meta_biomart_chrab %>%  dplyr::select(primary, 
                colnames(proteomics_tbl_meta_biomart_chrab[, -1])[
                  apply(proteomics_tbl_meta_biomart_chrab[, -1] ,2, function(cl){ length(table(cl))>1 } )
                  ] )

proteomics_tbl_meta_biomart_SNP <- wideFormat(multiomics_MAE[,,"SNPs"]) %>% as_tibble()
#proteomics_tbl_meta_biomart_SNP[, -1] <- (proteomics_tbl_meta_biomart_SNP[, -1] == 1)
proteomics_tbl_meta_biomart_SNP <- mutate_if(proteomics_tbl_meta_biomart_SNP, is.numeric, as.logical)
proteomics_tbl_meta_biomart_SNP <- proteomics_tbl_meta_biomart_SNP %>%  dplyr::select(primary, 
                colnames(proteomics_tbl_meta_biomart_SNP[, -1])[
                  apply(proteomics_tbl_meta_biomart_SNP[, -1] ,2, function(cl){ all(table(cl)>2) } )
                  ] )
proteomics_tbl_meta_biomart_SNP <- proteomics_tbl_meta_biomart_SNP %>%  dplyr::select(primary, 
                colnames(proteomics_tbl_meta_biomart_SNP[, -1])[
                  apply(proteomics_tbl_meta_biomart_SNP[, -1] ,2, function(cl){ length(table(cl))>1 } )
                  ] )

proteomics_tbl_meta_biomart_health <- wideFormat(multiomics_MAE[,,"health_record_bin"]) %>% as_tibble() %>% dplyr::select(primary, health_record_bin_IGHV_mutated, health_record_bin_elderly_at_diagnosis, health_record_bin_treated)
proteomics_tbl_meta_biomart_health <- mutate_if(proteomics_tbl_meta_biomart_health, is.numeric, as.logical)

proteomics_tbl_meta_biomart <- left_join(
  left_join((longFormat(multiomics_MAE[,,"proteomics"], colDataCols = c("trisomy12", "IGHV") ) %>% as_tibble() %>%
                                            mutate(IGHV= if_else(IGHV %in% c("M", "U"), IGHV, "NA") ) ), 
                                         metadata(multiomics_MAE)$gene_symbol_mapping[c(2,3,5:7)], 
                                         by=c("rowname"="hgnc_symbol")),
  proteomics_tbl_meta_biomart_chrab, by=c("primary"))
proteomics_tbl_meta_biomart <- left_join( left_join(proteomics_tbl_meta_biomart,
          proteomics_tbl_meta_biomart_SNP, by=c("primary")),
          proteomics_tbl_meta_biomart_health, by=c("primary"))
Create Drug and Proteomics objects
drug_and_proteomics_prot <- longFormat(multiomics_MAE[,,c("proteomics")]) %>% as_tibble()
drug_and_proteomics_drug <- longFormat(multiomics_MAE[,,c("drug_resp_mono")]) %>% as_tibble()
drug_and_proteomics_drug <- drug_and_proteomics_drug %>% separate(rowname, into = c("Drug", "conc"), sep = "_") %>% 
  group_by(assay, primary, rowname=Drug, colname ) %>%
  dplyr::summarise(value=mean(value, na.rm=TRUE)) %>% ungroup()
drug_and_proteomics <- bind_rows(drug_and_proteomics_prot, drug_and_proteomics_drug)

pats_drug_and_prot <- drug_and_proteomics %>% group_by(primary) %>% dplyr::summarise(nassay=length(unique(assay))) %>% dplyr::filter(nassay==2) %>% .$primary
drug_and_proteomics <- drug_and_proteomics %>% dplyr::filter(primary %in% pats_drug_and_prot)
all_prot <- drug_and_proteomics %>% dplyr::filter(assay=="proteomics", rowname %in% prot_few_nas) %>% .$rowname %>% unique()

all_drugs <- drug_and_proteomics %>% dplyr::filter(assay=="drug_resp_mono") %>% .$rowname %>% unique()

prot50 <- rownames(assay(multiomics_MAE[prot_few_nas , ,"proteomics"])[(assay(multiomics_MAE[prot_few_nas , ,"proteomics"]) %>% rowVars(na.rm = TRUE)) > Var50, ])
prot500 <- rownames(assay(multiomics_MAE[prot_few_nas , ,"proteomics"])[(assay(multiomics_MAE[prot_few_nas , ,"proteomics"]) %>% rowVars(na.rm = TRUE)) > Var500, ])
prot1000 <- rownames(assay(multiomics_MAE[prot_few_nas , ,"proteomics"])[(assay(multiomics_MAE[prot_few_nas , ,"proteomics"]) %>% rowVars(na.rm = TRUE)) > Var1000, ])

Overlapping patients proteomics and RNASeq

pat_overlap_prot_RNA <- colnames(intersectColumns(multiomics_MAE[,,c("proteomics", "RNAseq_norm")]))[["proteomics"]]

KEGG load gene sets BCR and spliceosome

BCR_genes <- read_excel(path = "/Users/sophierabe/Desktop/PhD/Bioinfo/Data/KEGG_BCellReceptorSignalingPathway.xlsx")
splice_genes <-read_table(file = "/Users/sophierabe/Desktop/PhD/Bioinfo/Data/KEGG_Spliceosome.txt", skip = 2, col_names = "symbol")
## Parsed with column specification:
## cols(
##   symbol = col_character()
## )

This is what the object looks like

proteomics_tbl_meta_biomart
as_tibble(colData(multiomics_MAE))
assay(multiomics_MAE[prot_few_nas , ,"proteomics"])[1:5,1:5]
##          OMZP0012     OMZP0042    OMZP0062    OMZP0094    OMZP0105
## A2M   -0.23106599 -0.123775317 -0.20324611 -0.11422224  0.05369119
## AAAS  -0.04446032 -0.003929938 -0.10881390  0.21728422 -0.24019198
## AACS   0.18091637 -0.469318906  0.01926965  0.46120353  0.27485862
## AAGAB  0.11577450  0.129642752  0.10785538 -0.06247763  0.25647801
## AAMDC  0.21443028 -0.003820597 -0.09936349  0.62120261 -0.26938714
dim(assay(multiomics_MAE[prot_few_nas , ,"proteomics"]))
## [1] 7311   68

Some numbers for the different datasets

print(paste( round( (!(assay(multiomics_MAE[,,"proteomics"]) %>% is.na() ) ) %>% colSums() %>% mean) , "proteins per sample were detected on average in proteomics data"))
## [1] "7311 proteins per sample were detected on average in proteomics data"
print(paste( round( (!(assay(multiomics_MAE[,,"RNAseq_full"]) %>% is.na() ) ) %>% 
                      colSums() %>% mean) , 
             "transcipts per sample were detected on average in RNASeq data"))
## [1] "63214 transcipts per sample were detected on average in RNASeq data"
if(!any(is.na(assay(multiomics_MAE[,,"RNAseq_full"])))){
  print("No NAs in RNASeq dataset")
}
## [1] "No NAs in RNASeq dataset"
print( paste( ( (assay(multiomics_MAE[,,"RNAseq_full"]) %>% rowMeans() ) > 0 ) %>% sum , "different transcripts were detected"))
## [1] "40083 different transcripts were detected"
print(
  paste(
    multiomics_MAE@metadata$gene_symbol_mapping %>% 
  filter(ensembl_gene_id %in%
           names((assay(multiomics_MAE[,,"RNAseq_full"]) %>% rowMeans() ) > 0 ) ) %>%
  dplyr::select(hgnc_symbol) %>% unique %>%
  filter(hgnc_symbol != "") %>% 
  nrow, 
  "transcripts with unique hgnc symbols"))
## [1] "35402 transcripts with unique hgnc symbols"
print( paste(
(multiomics_MAE@metadata$gene_symbol_mapping %>% 
  filter(ensembl_gene_id %in%
           names((assay(multiomics_MAE[,,"RNAseq_full"]) %>% rowMeans() ) > 0 ) ) %>%
  dplyr::select(hgnc_symbol) %>% unique %>%
  filter(hgnc_symbol != "") %>% .$hgnc_symbol %in%
rownames(multiomics_MAE@ExperimentList$proteomics) ) %>% sum,
"matching proteins and transcripts detected"))
## [1] "7199 matching proteins and transcripts detected"
protpats <- multiomics_MAE@sampleMap %>% as_tibble() %>% filter(assay=="proteomics") %>% .$primary %>% unique
message("Available datasets for proteomics patients:")
multiomics_MAE[,protpats,]@sampleMap %>% .$assay %>% table
## .
##                 SNPs       drug_resp_mono         drug_resp_co 
##                   67                   68                   68 
##          chrom_abber    health_record_bin health_record_scaled 
##                   68                   68                   68 
##           proteomics          RNAseq_full          RNAseq_norm 
##                   68                   59                   59

Oncoplot

order_oncoplot <- longFormat(multiomics_MAE[,,c("SNPs","chrom_abber")]) %>% 
  as_tibble() %>% 
  group_by(rowname) %>%
  summarise(total=sum(value, na.rm = TRUE)) %>%
  arrange(total ) %>%
  .$rowname

tmp <- wideFormat(multiomics_MAE[,,c("SNPs","chrom_abber")]) %>% as_tibble() 
colnames(tmp) <- colnames(tmp) %>% gsub("SNPs_|chrom_abber_", "",.)
#tmp %>% arrange( desc( del13q14 ) )
for(l in 1:length(order_oncoplot)){
  tmp <- tmp %>% arrange_( as.name(order_oncoplot[l] ) )
}


onco_center <- 
  longFormat(multiomics_MAE[,,c("SNPs","chrom_abber")]) %>% 
  as_tibble() %>% 
  mutate(alteration= if_else(value==1, assay, as.character(value))) %>%
  ggplot(aes(primary, rowname, fill=alteration)) + 
  geom_tile(color="black") + 
  scale_fill_manual(values=c("white", "orange1", "#ca0020", "grey"), labels=c("wt", "CNV", "SNV", "NA" ), na.translate=FALSE) + 
  #pp_sra_noguides_tilted +
  scale_y_discrete(limits=order_oncoplot) +
  scale_x_discrete(limits=rev(tmp$primary)) +
  theme(panel.background = element_rect(fill= "darkgrey"), panel.grid = element_blank(), 
        axis.text.x = element_blank(), axis.title = element_blank(),
        aspect.ratio = 1, axis.ticks.x = element_blank(), legend.position = "bottom") +
  guides(fill=guide_legend(title = NULL ))

onco_right <-longFormat(multiomics_MAE[,,c("SNPs","chrom_abber")]) %>% 
  as_tibble() %>%
  ggplot(aes(rowname, value)) + geom_col(aes(fill=assay)) +
  scale_x_discrete(limits=order_oncoplot) +
  coord_flip()+
  scale_fill_manual(values=c("orange1", "#ca0020")) +
  theme(panel.background = element_rect(fill= "white"), panel.grid = element_blank(), axis.title = element_blank(),
        aspect.ratio = 1, axis.line.x = element_line(color="black")) 

onco_right_perc <- longFormat(multiomics_MAE[,,c("SNPs","chrom_abber")]) %>% as_tibble()  %>% 
  group_by(rowname) %>% 
  dplyr::summarise(perc=round(sum(value, na.rm = TRUE)/ sum(!is.na(value)) , 2) ) %>% 
  ggplot(aes(rowname, 1, label=paste0(perc*100, "%") )) + 
  geom_text() + coord_flip() + theme_void() + 
  scale_x_discrete(limits=order_oncoplot)

onco_right_total <- 
  longFormat(multiomics_MAE[,,c("SNPs","chrom_abber")]) %>% as_tibble()  %>% 
  group_by(rowname) %>% 
  dplyr::summarise(total=sum(value, na.rm = TRUE )) %>% 
  ggplot(aes(rowname, 1, label=total)) + 
  geom_text() + coord_flip() + theme_void() + 
  scale_x_discrete(limits=order_oncoplot)

onco_top <-longFormat(multiomics_MAE[,,c("SNPs","chrom_abber")]) %>% 
  as_tibble() %>%
  ggplot(aes(primary, value)) + geom_col(aes(fill=assay)) +
  scale_x_discrete(limits=rev(tmp$primary)) +
  scale_fill_manual(values=c("orange1", "#ca0020")) +
  theme(panel.background = element_rect(fill= "white"), panel.grid = element_blank(), axis.title = element_blank(),
        aspect.ratio = 1, axis.line.x = element_line(color="black"))

p1<- insert_yaxis_grob(onco_center,get_panel(onco_right_total) , grid::unit(.2, "null"), position = "right")
p1.2<- insert_yaxis_grob(p1, get_panel(onco_right), grid::unit(.1, "null"), position = "right")
p1.3<- insert_yaxis_grob(p1.2, get_panel(onco_right_perc), grid::unit(.1, "null"), position = "right")
oncoplot <- insert_xaxis_grob(p1.3, onco_top, grid::unit(.2, "null"), position = "top")
ggdraw(oncoplot)

Drug classes

drs_sel <- metadata(multiomics_MAE)$drugs_functional_groups %>% 
  filter( !( grepl("\\+|DMSO|SSZ|phenyleth|Oxaliplatin|Hydroxychloroquine|Obatoclax mesylate|Vitamin", Drug)   ) ) 
order_drugs <- drs_sel$Pathways %>% table() %>% as_tibble() %>% arrange(desc(n)) %>% .$.
drug_nrs <- drs_sel %>%
  ggplot(aes(Pathways)) + geom_bar() + 
  pp_sra_noguides_tilted +
  scale_x_discrete(limits=order_drugs)

drug_nrs

#plot_grid(oncoplot, drug_nrs, align = "v", axis="t")

Number of proteins per patient

(!is.na(multiomics_MAE@ExperimentList$proteomics))  %>% 
  colSums() %>% 
  enframe(name = "Patients", value = "Nr. of proteins") %>%
  ggplot(aes(Patients, `Nr. of proteins`)) + geom_col() + pp_sra + theme(axis.text.x = element_blank()) 

Save the workspace for further scripts

save(all_drugs,
     BCR_genes, splice_genes,
     drug_and_proteomics, drug_and_proteomics_drug,
     multiomics_MAE,
     pat_overlap_prot_RNA,
     prot_few_nas,
     prot1000, prot500, prot50,
     proteomics_tbl_meta_biomart, proteomics_tbl_meta_biomart_chrab, proteomics_tbl_meta_biomart_health, proteomics_tbl_meta_biomart_SNP,
     Var1000, Var500, Var50,
      file="/Volumes/sd17b003/Sophie/Analysis/CLL_Proteomics/CLL_Proteomics_final/Proteomics_Git/Robjects/CLL_Proteomics_Setup.RData")

Session Info

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] cowplot_1.0.0               survminer_0.4.6            
##  [3] ConsensusClusterPlus_1.46.0 RColorBrewer_1.1-2         
##  [5] gplots_3.0.1.1              PMA_1.1                    
##  [7] MultiAssayExperiment_1.8.3  biomaRt_2.38.0             
##  [9] biomartr_0.9.0              Rtsne_0.15                 
## [11] IHW_1.10.1                  apeglm_1.4.2               
## [13] limma_3.38.3                fdrtool_1.2.15             
## [15] vsn_3.50.0                  forcats_0.4.0              
## [17] stringr_1.4.0               dplyr_0.8.3                
## [19] purrr_0.3.3                 readr_1.3.1                
## [21] tidyr_1.0.0                 tibble_2.1.3               
## [23] tidyverse_1.3.0             TOSTER_0.3.4               
## [25] DESeq2_1.22.2               SummarizedExperiment_1.12.0
## [27] DelayedArray_0.8.0          BiocParallel_1.16.6        
## [29] matrixStats_0.55.0          Biobase_2.42.0             
## [31] GenomicRanges_1.34.0        GenomeInfoDb_1.18.2        
## [33] IRanges_2.16.0              S4Vectors_0.20.1           
## [35] BiocGenerics_0.28.0         readxl_1.3.1               
## [37] ggrepel_0.8.1               ggpmisc_0.3.3              
## [39] ggfortify_0.4.8             ggbeeswarm_0.6.0           
## [41] effsize_0.7.6               ggpubr_0.2.4               
## [43] magrittr_1.5                lemon_0.4.3                
## [45] Hmisc_4.3-0                 ggplot2_3.2.1              
## [47] Formula_1.2-3               survival_3.1-8             
## [49] lattice_0.20-38             Matrix_1.2-18              
## [51] progress_1.2.2              reshape2_1.4.3             
## [53] pheatmap_1.0.12             openxlsx_4.1.4             
## [55] gtools_3.8.1                plyr_1.8.4                 
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.1.5        lazyeval_0.2.2         splines_3.5.2         
##   [4] lpsymphony_1.10.0      digest_0.6.23          htmltools_0.4.0       
##   [7] gdata_2.18.0           fansi_0.4.0            checkmate_1.9.4       
##  [10] memoise_1.1.0          cluster_2.1.0          Biostrings_2.50.2     
##  [13] annotate_1.60.1        modelr_0.1.5           prettyunits_1.0.2     
##  [16] colorspace_1.4-1       blob_1.2.0             rvest_0.3.5           
##  [19] haven_2.2.0            xfun_0.11              crayon_1.3.4          
##  [22] RCurl_1.95-4.12        jsonlite_1.6           genefilter_1.64.0     
##  [25] impute_1.56.0          zeallot_0.1.0          zoo_1.8-6             
##  [28] glue_1.3.1             gtable_0.3.0           zlibbioc_1.28.0       
##  [31] XVector_0.22.0         scales_1.1.0           DBI_1.0.0             
##  [34] Rcpp_1.0.3             emdbook_1.3.11         xtable_1.8-4          
##  [37] htmlTable_1.13.3       foreign_0.8-72         bit_1.1-14            
##  [40] km.ci_0.5-2            preprocessCore_1.44.0  htmlwidgets_1.5.1     
##  [43] httr_1.4.1             ellipsis_0.3.0         acepack_1.4.1         
##  [46] pkgconfig_2.0.3        XML_3.98-1.20          nnet_7.3-12           
##  [49] dbplyr_1.4.2           locfit_1.5-9.1         tidyselect_0.2.5      
##  [52] rlang_0.4.2            AnnotationDbi_1.44.0   munsell_0.5.0         
##  [55] cellranger_1.1.0       tools_3.5.2            cli_2.0.0             
##  [58] generics_0.0.2         RSQLite_2.1.4          broom_0.5.2           
##  [61] evaluate_0.14          yaml_2.2.0             knitr_1.26            
##  [64] bit64_0.9-7            fs_1.3.1               zip_2.0.4             
##  [67] survMisc_0.5.5         caTools_1.17.1.3       nlme_3.1-142          
##  [70] slam_0.1-46            xml2_1.2.2             compiler_3.5.2        
##  [73] rstudioapi_0.10        curl_4.3               beeswarm_0.2.3        
##  [76] affyio_1.52.0          ggsignif_0.6.0         reprex_0.3.0          
##  [79] geneplotter_1.60.0     stringi_1.4.3          KMsurv_0.1-5          
##  [82] vctrs_0.2.0            pillar_1.4.2           lifecycle_0.1.0       
##  [85] BiocManager_1.30.10    data.table_1.12.8      bitops_1.0-6          
##  [88] R6_2.4.1               latticeExtra_0.6-28    affy_1.60.0           
##  [91] KernSmooth_2.23-16     gridExtra_2.3          vipor_0.4.5           
##  [94] MASS_7.3-51.4          assertthat_0.2.1       withr_2.1.2           
##  [97] GenomeInfoDbData_1.2.0 hms_0.5.2              rpart_4.1-15          
## [100] coda_0.19-3            rmarkdown_1.18         bbmle_1.0.20          
## [103] numDeriv_2016.8-1.1    lubridate_1.7.4        base64enc_0.1-3